{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Analytics Vidhya: Practice Problem (Approach)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import re\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ls: cannot access '/home/pratos/Side-Project/av_articles/data/': No such file or directory\r\n"
     ]
    }
   ],
   "source": [
    "!ls /home/pratos/Side-Project/av_articles/data/"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Download the training & test data from the Practice Problem approach. We'll do a bit of quick investigation on the dataset:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.read_csv('../data/training.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Loan_ID</th>\n",
       "      <th>Gender</th>\n",
       "      <th>Married</th>\n",
       "      <th>Dependents</th>\n",
       "      <th>Education</th>\n",
       "      <th>Self_Employed</th>\n",
       "      <th>ApplicantIncome</th>\n",
       "      <th>CoapplicantIncome</th>\n",
       "      <th>LoanAmount</th>\n",
       "      <th>Loan_Amount_Term</th>\n",
       "      <th>Credit_History</th>\n",
       "      <th>Property_Area</th>\n",
       "      <th>Loan_Status</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>LP001002</td>\n",
       "      <td>Male</td>\n",
       "      <td>No</td>\n",
       "      <td>0</td>\n",
       "      <td>Graduate</td>\n",
       "      <td>No</td>\n",
       "      <td>5849</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>360.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>Urban</td>\n",
       "      <td>Y</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>LP001003</td>\n",
       "      <td>Male</td>\n",
       "      <td>Yes</td>\n",
       "      <td>1</td>\n",
       "      <td>Graduate</td>\n",
       "      <td>No</td>\n",
       "      <td>4583</td>\n",
       "      <td>1508.0</td>\n",
       "      <td>128.0</td>\n",
       "      <td>360.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>Rural</td>\n",
       "      <td>N</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>LP001005</td>\n",
       "      <td>Male</td>\n",
       "      <td>Yes</td>\n",
       "      <td>0</td>\n",
       "      <td>Graduate</td>\n",
       "      <td>Yes</td>\n",
       "      <td>3000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>66.0</td>\n",
       "      <td>360.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>Urban</td>\n",
       "      <td>Y</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>LP001006</td>\n",
       "      <td>Male</td>\n",
       "      <td>Yes</td>\n",
       "      <td>0</td>\n",
       "      <td>Not Graduate</td>\n",
       "      <td>No</td>\n",
       "      <td>2583</td>\n",
       "      <td>2358.0</td>\n",
       "      <td>120.0</td>\n",
       "      <td>360.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>Urban</td>\n",
       "      <td>Y</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>LP001008</td>\n",
       "      <td>Male</td>\n",
       "      <td>No</td>\n",
       "      <td>0</td>\n",
       "      <td>Graduate</td>\n",
       "      <td>No</td>\n",
       "      <td>6000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>141.0</td>\n",
       "      <td>360.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>Urban</td>\n",
       "      <td>Y</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    Loan_ID Gender Married Dependents     Education Self_Employed  \\\n",
       "0  LP001002   Male      No          0      Graduate            No   \n",
       "1  LP001003   Male     Yes          1      Graduate            No   \n",
       "2  LP001005   Male     Yes          0      Graduate           Yes   \n",
       "3  LP001006   Male     Yes          0  Not Graduate            No   \n",
       "4  LP001008   Male      No          0      Graduate            No   \n",
       "\n",
       "   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \\\n",
       "0             5849                0.0         NaN             360.0   \n",
       "1             4583             1508.0       128.0             360.0   \n",
       "2             3000                0.0        66.0             360.0   \n",
       "3             2583             2358.0       120.0             360.0   \n",
       "4             6000                0.0       141.0             360.0   \n",
       "\n",
       "   Credit_History Property_Area Loan_Status  \n",
       "0             1.0         Urban           Y  \n",
       "1             1.0         Rural           N  \n",
       "2             1.0         Urban           Y  \n",
       "3             1.0         Urban           Y  \n",
       "4             1.0         Urban           Y  "
      ]
     },
     "execution_count": 96,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Shape of the data is:(614, 13)\n"
     ]
    }
   ],
   "source": [
    "print(\"Shape of the data is:{}\".format(data.shape))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "List of columns is: ['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status']\n"
     ]
    }
   ],
   "source": [
    "print(\"List of columns is: {}\".format(list(data.columns)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here, `Loan_status` is our `target variable`, the rest are `predictor variables`. `Loan_ID` wouldn't help much in making predictions about `defaulters` hence we won't be considering that variable in our final model."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Finding out the `null/Nan` values in the columns:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The number of null values in:Loan_ID == 0\n",
      "The number of null values in:Gender == 13\n",
      "The number of null values in:Married == 3\n",
      "The number of null values in:Dependents == 15\n",
      "The number of null values in:Education == 0\n",
      "The number of null values in:Self_Employed == 32\n",
      "The number of null values in:ApplicantIncome == 0\n",
      "The number of null values in:CoapplicantIncome == 0\n",
      "The number of null values in:LoanAmount == 22\n",
      "The number of null values in:Loan_Amount_Term == 14\n",
      "The number of null values in:Credit_History == 50\n",
      "The number of null values in:Property_Area == 0\n",
      "The number of null values in:Loan_Status == 0\n"
     ]
    }
   ],
   "source": [
    "for _ in data.columns:\n",
    "    print(\"The number of null values in:{} == {}\".format(_, data[_].isnull().sum()))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We'll check out the values (labels) for the columns having missing values:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "List of unique labels for Dependents:::{nan, '3+', '0', '2', '1'}\n",
      "List of unique labels for Self_Employed:::{nan, 'Yes', 'No'}\n",
      "List of unique labels for Loan_Amount_Term:::{nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 12.0, 36.0, 300.0, 180.0, 60.0, 84.0, 480.0, 360.0, 240.0, 120.0}\n",
      "List of unique labels for Gender:::{'Male', nan, 'Female'}\n",
      "List of unique labels for Married:::{nan, 'Yes', 'No'}\n"
     ]
    }
   ],
   "source": [
    "missing_pred = ['Dependents', 'Self_Employed', 'Loan_Amount_Term', 'Gender', 'Married']\n",
    "\n",
    "for _ in missing_pred:\n",
    "    print(\"List of unique labels for {}:::{}\".format(_, set(data[_])))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "For the rest of the missing values:\n",
    "\n",
    "- `Dependents`: Assumption that there are no dependents\n",
    "- `Self_Employed`: Assumption that the applicant is not self-employed\n",
    "- `Loan_Amount_Term`: Assumption that the loan amount term is median value\n",
    "- `Credit_History`: Assumption that the person has a credit history\n",
    "- `Married`: If nothing specified, applicant is not married\n",
    "- `Gender`: Assuming the gender is Male for the missing values\n",
    "\n",
    "Before that we'll divide the dataset in train and test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Loan_ID',\n",
       " 'Gender',\n",
       " 'Married',\n",
       " 'Dependents',\n",
       " 'Education',\n",
       " 'Self_Employed',\n",
       " 'ApplicantIncome',\n",
       " 'CoapplicantIncome',\n",
       " 'LoanAmount',\n",
       " 'Loan_Amount_Term',\n",
       " 'Credit_History',\n",
       " 'Property_Area',\n",
       " 'Loan_Status']"
      ]
     },
     "execution_count": 102,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "list(data.columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "pred_var = ['Gender','Married','Dependents','Education','Self_Employed','ApplicantIncome','CoapplicantIncome',\\\n",
    "            'LoanAmount','Loan_Amount_Term','Credit_History','Property_Area']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(data[pred_var], data['Loan_Status'], \\\n",
    "                                                    test_size=0.25, random_state=42)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We'll compile a list of `pre-processing` steps that we do on to create a custom `estimator`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X_train['Dependents'] = X_train['Dependents'].fillna('0')\n",
    "X_train['Self_Employed'] = X_train['Self_Employed'].fillna('No')\n",
    "X_train['Loan_Amount_Term'] = X_train['Loan_Amount_Term'].fillna(X_train['Loan_Amount_Term'].mean())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train['Credit_History'] = X_train['Credit_History'].fillna(1)\n",
    "X_train['Married'] = X_train['Married'].fillna('No')\n",
    "X_train['Gender'] = X_train['Gender'].fillna('Male')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X_train['LoanAmount'] = X_train['LoanAmount'].fillna(X_train['LoanAmount'].mean())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We have a lot of `string` labels that we encounter in `Gender`, `Married`, `Education`, `Self_Employed` & `Property_Area` columns."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "List of unique labels Gender:{'Male', 'Female'}\n",
      "List of unique labels Married:{'Yes', 'No'}\n",
      "List of unique labels Education:{'Not Graduate', 'Graduate'}\n",
      "List of unique labels Self_Employed:{'Yes', 'No'}\n",
      "List of unique labels Property_Area:{'Semiurban', 'Rural', 'Urban'}\n",
      "List of unique labels Dependents:{'3+', '0', '2', '1'}\n"
     ]
    }
   ],
   "source": [
    "label_columns = ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area', 'Dependents']\n",
    "\n",
    "for _ in label_columns:\n",
    "    print(\"List of unique labels {}:{}\".format(_, set(X_train[_])))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "gender_values = {'Female' : 0, 'Male' : 1} \n",
    "married_values = {'No' : 0, 'Yes' : 1}\n",
    "education_values = {'Graduate' : 0, 'Not Graduate' : 1}\n",
    "employed_values = {'No' : 0, 'Yes' : 1}\n",
    "property_values = {'Rural' : 0, 'Urban' : 1, 'Semiurban' : 2}\n",
    "dependent_values = {'3+': 3, '0': 0, '2': 2, '1': 1}\n",
    "X_train.replace({'Gender': gender_values, 'Married': married_values, 'Education': education_values, \\\n",
    "                'Self_Employed': employed_values, 'Property_Area': property_values, 'Dependents': dependent_values}\\\n",
    "                , inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Gender</th>\n",
       "      <th>Married</th>\n",
       "      <th>Dependents</th>\n",
       "      <th>Education</th>\n",
       "      <th>Self_Employed</th>\n",
       "      <th>ApplicantIncome</th>\n",
       "      <th>CoapplicantIncome</th>\n",
       "      <th>LoanAmount</th>\n",
       "      <th>Loan_Amount_Term</th>\n",
       "      <th>Credit_History</th>\n",
       "      <th>Property_Area</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>92</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>3273</td>\n",
       "      <td>1820.0</td>\n",
       "      <td>81.0</td>\n",
       "      <td>360.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>304</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>4000</td>\n",
       "      <td>2500.0</td>\n",
       "      <td>140.0</td>\n",
       "      <td>360.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>68</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>7100</td>\n",
       "      <td>0.0</td>\n",
       "      <td>125.0</td>\n",
       "      <td>60.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>4950</td>\n",
       "      <td>0.0</td>\n",
       "      <td>125.0</td>\n",
       "      <td>360.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>211</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3430</td>\n",
       "      <td>1250.0</td>\n",
       "      <td>128.0</td>\n",
       "      <td>360.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     Gender  Married  Dependents  Education  Self_Employed  ApplicantIncome  \\\n",
       "92        1        1           2          1              0             3273   \n",
       "304       1        0           0          0              0             4000   \n",
       "68        1        1           3          1              1             7100   \n",
       "15        1        0           0          0              0             4950   \n",
       "211       1        1           3          0              0             3430   \n",
       "\n",
       "     CoapplicantIncome  LoanAmount  Loan_Amount_Term  Credit_History  \\\n",
       "92              1820.0        81.0             360.0             1.0   \n",
       "304             2500.0       140.0             360.0             1.0   \n",
       "68                 0.0       125.0              60.0             1.0   \n",
       "15                 0.0       125.0             360.0             1.0   \n",
       "211             1250.0       128.0             360.0             0.0   \n",
       "\n",
       "     Property_Area  \n",
       "92               1  \n",
       "304              0  \n",
       "68               1  \n",
       "15               1  \n",
       "211              2  "
      ]
     },
     "execution_count": 109,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Gender                 int64\n",
       "Married                int64\n",
       "Dependents             int64\n",
       "Education              int64\n",
       "Self_Employed          int64\n",
       "ApplicantIncome        int64\n",
       "CoapplicantIncome    float64\n",
       "LoanAmount           float64\n",
       "Loan_Amount_Term     float64\n",
       "Credit_History       float64\n",
       "Property_Area          int64\n",
       "dtype: object"
      ]
     },
     "execution_count": 110,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The number of null values in:Gender == 0\n",
      "The number of null values in:Married == 0\n",
      "The number of null values in:Dependents == 0\n",
      "The number of null values in:Education == 0\n",
      "The number of null values in:Self_Employed == 0\n",
      "The number of null values in:ApplicantIncome == 0\n",
      "The number of null values in:CoapplicantIncome == 0\n",
      "The number of null values in:LoanAmount == 16\n",
      "The number of null values in:Loan_Amount_Term == 0\n",
      "The number of null values in:Credit_History == 0\n",
      "The number of null values in:Property_Area == 0\n"
     ]
    }
   ],
   "source": [
    "for _ in X_train.columns:\n",
    "    print(\"The number of null values in:{} == {}\".format(_, X_train[_].isnull().sum()))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Converting the pandas dataframes to numpy arrays:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X_train = X_train.as_matrix()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(460, 11)"
      ]
     },
     "execution_count": 69,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We'll create a custom `pre-processing estimator` that would help us in writing better pipelines and in future deployments:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.base import BaseEstimator, TransformerMixin\n",
    "\n",
    "class PreProcessing(BaseEstimator, TransformerMixin):\n",
    "    \"\"\"Custom Pre-Processing estimator for our use-case\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self):\n",
    "        pass\n",
    "\n",
    "    def transform(self, df):\n",
    "        \"\"\"Regular transform() that is a help for training, validation & testing datasets\n",
    "           (NOTE: The operations performed here are the ones that we did prior to this cell)\n",
    "        \"\"\"\n",
    "        pred_var = ['Gender','Married','Dependents','Education','Self_Employed','ApplicantIncome','CoapplicantIncome',\\\n",
    "            'LoanAmount','Loan_Amount_Term','Credit_History','Property_Area']\n",
    "        \n",
    "        df = df[pred_var]\n",
    "        \n",
    "        df['Dependents'] = df['Dependents'].fillna(0)\n",
    "        df['Self_Employed'] = df['Self_Employed'].fillna('No')\n",
    "        df['Loan_Amount_Term'] = df['Loan_Amount_Term'].fillna(self.term_mean_)\n",
    "        df['Credit_History'] = df['Credit_History'].fillna(1)\n",
    "        df['Married'] = df['Married'].fillna('No')\n",
    "        df['Gender'] = df['Gender'].fillna('Male')\n",
    "        df['LoanAmount'] = df['LoanAmount'].fillna(self.amt_mean_)\n",
    "        \n",
    "        gender_values = {'Female' : 0, 'Male' : 1} \n",
    "        married_values = {'No' : 0, 'Yes' : 1}\n",
    "        education_values = {'Graduate' : 0, 'Not Graduate' : 1}\n",
    "        employed_values = {'No' : 0, 'Yes' : 1}\n",
    "        property_values = {'Rural' : 0, 'Urban' : 1, 'Semiurban' : 2}\n",
    "        dependent_values = {'3+': 3, '0': 0, '2': 2, '1': 1}\n",
    "        df.replace({'Gender': gender_values, 'Married': married_values, 'Education': education_values, \\\n",
    "                    'Self_Employed': employed_values, 'Property_Area': property_values, \\\n",
    "                    'Dependents': dependent_values}, inplace=True)\n",
    "        \n",
    "        return df.as_matrix()\n",
    "\n",
    "    def fit(self, df, y=None, **fit_params):\n",
    "        \"\"\"Fitting the Training dataset & calculating the required values from train\n",
    "           e.g: We will need the mean of X_train['Loan_Amount_Term'] that will be used in\n",
    "                transformation of X_test\n",
    "        \"\"\"\n",
    "        \n",
    "        self.term_mean_ = df['Loan_Amount_Term'].mean()\n",
    "        self.amt_mean_ = df['LoanAmount'].mean()\n",
    "        return self"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To make sure that this works, let's do a test run for it:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(data[pred_var], data['Loan_Status'], \\\n",
    "                                                    test_size=0.25, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Gender</th>\n",
       "      <th>Married</th>\n",
       "      <th>Dependents</th>\n",
       "      <th>Education</th>\n",
       "      <th>Self_Employed</th>\n",
       "      <th>ApplicantIncome</th>\n",
       "      <th>CoapplicantIncome</th>\n",
       "      <th>LoanAmount</th>\n",
       "      <th>Loan_Amount_Term</th>\n",
       "      <th>Credit_History</th>\n",
       "      <th>Property_Area</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>92</th>\n",
       "      <td>Male</td>\n",
       "      <td>Yes</td>\n",
       "      <td>2</td>\n",
       "      <td>Not Graduate</td>\n",
       "      <td>No</td>\n",
       "      <td>3273</td>\n",
       "      <td>1820.0</td>\n",
       "      <td>81.0</td>\n",
       "      <td>360.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>Urban</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>304</th>\n",
       "      <td>Male</td>\n",
       "      <td>No</td>\n",
       "      <td>0</td>\n",
       "      <td>Graduate</td>\n",
       "      <td>No</td>\n",
       "      <td>4000</td>\n",
       "      <td>2500.0</td>\n",
       "      <td>140.0</td>\n",
       "      <td>360.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>Rural</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>68</th>\n",
       "      <td>Male</td>\n",
       "      <td>Yes</td>\n",
       "      <td>3+</td>\n",
       "      <td>Not Graduate</td>\n",
       "      <td>Yes</td>\n",
       "      <td>7100</td>\n",
       "      <td>0.0</td>\n",
       "      <td>125.0</td>\n",
       "      <td>60.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>Urban</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>Male</td>\n",
       "      <td>No</td>\n",
       "      <td>0</td>\n",
       "      <td>Graduate</td>\n",
       "      <td>No</td>\n",
       "      <td>4950</td>\n",
       "      <td>0.0</td>\n",
       "      <td>125.0</td>\n",
       "      <td>360.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>Urban</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>211</th>\n",
       "      <td>Male</td>\n",
       "      <td>Yes</td>\n",
       "      <td>3+</td>\n",
       "      <td>Graduate</td>\n",
       "      <td>No</td>\n",
       "      <td>3430</td>\n",
       "      <td>1250.0</td>\n",
       "      <td>128.0</td>\n",
       "      <td>360.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Semiurban</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    Gender Married Dependents     Education Self_Employed  ApplicantIncome  \\\n",
       "92    Male     Yes          2  Not Graduate            No             3273   \n",
       "304   Male      No          0      Graduate            No             4000   \n",
       "68    Male     Yes         3+  Not Graduate           Yes             7100   \n",
       "15    Male      No          0      Graduate            No             4950   \n",
       "211   Male     Yes         3+      Graduate            No             3430   \n",
       "\n",
       "     CoapplicantIncome  LoanAmount  Loan_Amount_Term  Credit_History  \\\n",
       "92              1820.0        81.0             360.0             1.0   \n",
       "304             2500.0       140.0             360.0             1.0   \n",
       "68                 0.0       125.0              60.0             1.0   \n",
       "15                 0.0       125.0             360.0             1.0   \n",
       "211             1250.0       128.0             360.0             0.0   \n",
       "\n",
       "    Property_Area  \n",
       "92          Urban  \n",
       "304         Rural  \n",
       "68          Urban  \n",
       "15          Urban  \n",
       "211     Semiurban  "
      ]
     },
     "execution_count": 117,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The number of null values in:Gender == 11\n",
      "The number of null values in:Married == 1\n",
      "The number of null values in:Dependents == 11\n",
      "The number of null values in:Education == 0\n",
      "The number of null values in:Self_Employed == 20\n",
      "The number of null values in:ApplicantIncome == 0\n",
      "The number of null values in:CoapplicantIncome == 0\n",
      "The number of null values in:LoanAmount == 16\n",
      "The number of null values in:Loan_Amount_Term == 11\n",
      "The number of null values in:Credit_History == 36\n",
      "The number of null values in:Property_Area == 0\n"
     ]
    }
   ],
   "source": [
    "for _ in X_train.columns:\n",
    "    print(\"The number of null values in:{} == {}\".format(_, X_train[_].isnull().sum()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "preprocess = PreProcessing()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "PreProcessing()"
      ]
     },
     "execution_count": 122,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "preprocess"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 123,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "PreProcessing()"
      ]
     },
     "execution_count": 123,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "preprocess.fit(X_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 124,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X_train_transformed = preprocess.transform(X_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 125,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(460, 11)"
      ]
     },
     "execution_count": 125,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train_transformed.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "So our small experiment to write a custom `estimator` worked. This would be helpful further."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X_test_transformed = preprocess.transform(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 127,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(154, 11)"
      ]
     },
     "execution_count": 127,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_test_transformed.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 128,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "y_test = y_test.replace({'Y':1, 'N':0}).as_matrix()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 129,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "y_train = y_train.replace({'Y':1, 'N':0}).as_matrix()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 130,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "param_grid = {\"randomforestclassifier__n_estimators\" : [10, 20, 30],\n",
    "             \"randomforestclassifier__max_depth\" : [None, 6, 8, 10],\n",
    "             \"randomforestclassifier__max_leaf_nodes\": [None, 5, 10, 20], \n",
    "             \"randomforestclassifier__min_impurity_split\": [0.1, 0.2, 0.3]}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.pipeline import make_pipeline\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "\n",
    "pipe = make_pipeline(PreProcessing(),\n",
    "                    RandomForestClassifier())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Pipeline(memory=None,\n",
       "     steps=[('preprocessing', PreProcessing()), ('randomforestclassifier', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n",
       "            max_depth=None, max_features='auto', max_leaf_nodes=None,\n",
       "            min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "            min_samples_leaf=1, min_samples_split=2,\n",
       "            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,\n",
       "            oob_score=False, random_state=None, verbose=0,\n",
       "            warm_start=False))])"
      ]
     },
     "execution_count": 132,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pipe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 133,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split, GridSearchCV\n",
    "\n",
    "grid = GridSearchCV(pipe, param_grid=param_grid, cv=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 134,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "GridSearchCV(cv=3, error_score='raise',\n",
       "       estimator=Pipeline(memory=None,\n",
       "     steps=[('preprocessing', PreProcessing()), ('randomforestclassifier', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n",
       "            max_depth=None, max_features='auto', max_leaf_nodes=None,\n",
       "            min_impurity_decrease=0.0, min_impu..._jobs=1,\n",
       "            oob_score=False, random_state=None, verbose=0,\n",
       "            warm_start=False))]),\n",
       "       fit_params=None, iid=True, n_jobs=1,\n",
       "       param_grid={'randomforestclassifier__n_estimators': [10, 20, 30], 'randomforestclassifier__min_impurity_split': [0.1, 0.2, 0.3], 'randomforestclassifier__max_depth': [None, 6, 8, 10], 'randomforestclassifier__max_leaf_nodes': [None, 5, 10, 20]},\n",
       "       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,\n",
       "       scoring=None, verbose=0)"
      ]
     },
     "execution_count": 134,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "grid"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 135,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(data[pred_var], data['Loan_Status'], \\\n",
    "                                                    test_size=0.25, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 136,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "GridSearchCV(cv=3, error_score='raise',\n",
       "       estimator=Pipeline(memory=None,\n",
       "     steps=[('preprocessing', PreProcessing()), ('randomforestclassifier', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n",
       "            max_depth=None, max_features='auto', max_leaf_nodes=None,\n",
       "            min_impurity_decrease=0.0, min_impu..._jobs=1,\n",
       "            oob_score=False, random_state=None, verbose=0,\n",
       "            warm_start=False))]),\n",
       "       fit_params=None, iid=True, n_jobs=1,\n",
       "       param_grid={'randomforestclassifier__n_estimators': [10, 20, 30], 'randomforestclassifier__min_impurity_split': [0.1, 0.2, 0.3], 'randomforestclassifier__max_depth': [None, 6, 8, 10], 'randomforestclassifier__max_leaf_nodes': [None, 5, 10, 20]},\n",
       "       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,\n",
       "       scoring=None, verbose=0)"
      ]
     },
     "execution_count": 136,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "grid.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 137,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Best parameters: {'randomforestclassifier__max_leaf_nodes': 20, 'randomforestclassifier__min_impurity_split': 0.2, 'randomforestclassifier__max_depth': 8, 'randomforestclassifier__n_estimators': 30}\n"
     ]
    }
   ],
   "source": [
    "print(\"Best parameters: {}\".format(grid.best_params_))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 138,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Test set score: 0.78\n"
     ]
    }
   ],
   "source": [
    "print(\"Test set score: {:.2f}\".format(grid.score(X_test, y_test)))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
